library(formattable)
library(data.table)
library(stringr)
library(ggmap)
library(tmaptools)
library(dplyr)
library(lubridate)
library(ggplot2)
library(ggrepel)
library(tidyverse)
library(osmdata)
library(sf)
library(rgeos)
library(rgdal)
library(geojsonsf)
manhattan <- read.csv("https://raw.githubusercontent.com/lcao21/GIS3_Final_Project/master/Data/manhattan.csv", stringsAsFactors=FALSE)
formattable(head(manhattan))
| BOROUGH | NEIGHBORHOOD | BUILDING.CLASS.CATEGORY | TAX.CLASS.AT.PRESENT | BLOCK | LOT | EASE.MENT | BUILDING.CLASS.AT.PRESENT | ADDRESS | APARTMENT.NUMBER | ZIP.CODE | RESIDENTIAL.UNITS | COMMERCIAL.UNITS | TOTAL.UNITS | LAND.SQUARE.FEET | GROSS.SQUARE.FEET | YEAR.BUILT | TAX.CLASS.AT.TIME.OF.SALE | BUILDING.CLASS.AT.TIME.OF.SALE | SALE.PRICE | SALE.DATE |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 1 | ALPHABET CITY | 01 ONE FAMILY DWELLINGS | 1 | 376 | 43 | NA | S1 | 743 EAST 6TH STREET | 10009 | 1 | 1 | 2 | 2,090 | 3,680 | 1940 | 1 | S1 | 3,200,000 | 7/24/19 | |
| 1 | ALPHABET CITY | 01 ONE FAMILY DWELLINGS | 1 | 390 | 61 | NA | A4 | 189 EAST 7TH STREET | 10009 | 1 | 0 | 1 | 987 | 2,183 | 1860 | 1 | A4 | 0 | 9/25/19 | |
| 1 | ALPHABET CITY | 02 TWO FAMILY DWELLINGS | 1 | 404 | 1 | NA | B9 | 166 AVENUE A | 10009 | 2 | 0 | 2 | 1,510 | 4,520 | 1900 | 1 | B9 | 0 | 7/22/19 | |
| 1 | ALPHABET CITY | 03 THREE FAMILY DWELLINGS | 1 | 377 | 56 | NA | C0 | 263 EAST 7TH STREET | 10009 | 3 | 0 | 3 | 2,430 | 3,600 | 1899 | 1 | C0 | 6,300,000 | 4/30/19 | |
| 1 | ALPHABET CITY | 03 THREE FAMILY DWELLINGS | 1 | 393 | 9 | NA | C0 | 604 EAST 11TH STREET | 10009 | 3 | 0 | 3 | 2,375 | 5,110 | 1939 | 1 | C0 | 0 | 10/24/19 | |
| 1 | ALPHABET CITY | 07 RENTALS - WALKUP APARTMENTS | 2 | 372 | 23 | NA | C1 | 300 EAST 3RD STREET | 10009 | 12 | 0 | 12 | 2,393 | 7,989 | 2001 | 2 | C1 | 1,950,000 | 8/8/19 |
drops <- c("BOROUGH","EASE.MENT", "TAX.CLASS.AT.PRESENT", "BUILDING.CLASS.AT.PRESENT", "APARTMENT.NUMBER")
manhattan <- manhattan[ , !(names(manhattan) %in% drops)]
manhattan <- manhattan[manhattan$BUILDING.CLASS.CATEGORY %like% "FAMILY", ]
manhattan <-manhattan[!(manhattan$"SALE.PRICE"== 0),]
replace_commas<-function(x){
x<-as.numeric(gsub("\\,", "", x))
}
manhattan$SALE.PRICE <- replace_commas(manhattan$SALE.PRICE)
manhattan$SALE.PRICE <- as.numeric(manhattan$SALE.PRICE)
# Convert addresses to type character
manhattan$ADDRESS <- as.character(manhattan$ADDRESS)
# Remove anything before a dash, before a slash, and after a comma
manhattan$ADDRESS <- gsub(".*-","", manhattan$ADDRESS)
manhattan$ADDRESS <- gsub(".*/","", manhattan$ADDRESS)
manhattan$ADDRESS <- gsub("(.*),.*", "\\1", manhattan$ADDRESS)
# Remove rows with address lengths less than 3
manhattan$address_length <- c(str_count(manhattan$ADDRESS))
manhattan <- manhattan[!(manhattan$address_length < 3),]
# Add ", Manhattan" to each address
manhattan$ADDRESS <- paste(manhattan$ADDRESS, ", Manhattan", sep="")
geocoded_addresses <- geocode_OSM(manhattan$ADDRESS, as.data.frame = TRUE)
geocoded_addresses <- geocoded_addresses[c("query","lat", "lon")]
head(geocoded_addresses)
write.csv(geocoded_addresses,"geocoded_addresses.csv", row.names = FALSE)
Since running the above chunk took a very long time, I exported the dataframe output as a csv called “geocoded_addresses.csv”, so that I can use the csv whenever I need to re-run the markdown file without having to wait >= 5 minutes.
geocoded_addresses <- read.csv("https://raw.githubusercontent.com/lcao21/GIS3_Final_Project/master/Data/geocoded_addresses.csv")
formattable(head(geocoded_addresses))
| query | lat | lon |
|---|---|---|
| 743 EAST 6TH STREET, Manhattan | 40.72290 | -73.97740 |
| 189 EAST 7TH STREET, Manhattan | 40.72493 | -73.98051 |
| 166 AVENUE A, Manhattan | 40.72811 | -73.98174 |
| 263 EAST 7TH STREET, Manhattan | 40.72363 | -73.97741 |
| 604 EAST 11TH STREET, Manhattan | 40.72712 | -73.97916 |
| 483 WEST 22ND STREET, Manhattan | 40.74700 | -74.00411 |
The geocode_OSM() function couldn’t find a few addresses that aren’t huge losses so I’ll ignore those lost rows and merge the geocoded_addresses df with my manhattan df. In the case, since manhattan has more rows than geocoded_addresses, I conduct an “inner join” with merge():
# Inner join
manhattan <- merge(manhattan, geocoded_addresses, by.x = "ADDRESS", by.y = "query")
# Drop address_length column
manhattan <- manhattan[ , !(names(manhattan) %in% c("address_length"))]
formattable(head(manhattan))
| ADDRESS | NEIGHBORHOOD | BUILDING.CLASS.CATEGORY | BLOCK | LOT | ZIP.CODE | RESIDENTIAL.UNITS | COMMERCIAL.UNITS | TOTAL.UNITS | LAND.SQUARE.FEET | GROSS.SQUARE.FEET | YEAR.BUILT | TAX.CLASS.AT.TIME.OF.SALE | BUILDING.CLASS.AT.TIME.OF.SALE | SALE.PRICE | SALE.DATE | lat | lon |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 1 SUTTON PLACE, Manhattan | MIDTOWN EAST | 01 ONE FAMILY DWELLINGS | 1372 | 26 | 10022 | 1 | 0 | 1 | 1,434 | 5,339 | 1920 | 1 | A7 | 13000000 | 10/8/19 | 40.75738 | -73.96031 |
| 106 EAST 78TH STREET, Manhattan | UPPER EAST SIDE (59-79) | 01 ONE FAMILY DWELLINGS | 1412 | 68 | 10075 | 1 | 1 | 2 | 1,839 | 4,344 | 1899 | 1 | S1 | 7200000 | 6/10/19 | 40.77457 | -73.96056 |
| 109 WEST 11TH STREET, Manhattan | GREENWICH VILLAGE-WEST | 01 ONE FAMILY DWELLINGS | 607 | 50 | 10011 | 1 | 0 | 1 | 1,665 | 4,060 | 1899 | 1 | A4 | 11200000 | 5/22/19 | 40.73577 | -73.99867 |
| 11 EAST 129 STREET, Manhattan | HARLEM-CENTRAL | 01 ONE FAMILY DWELLINGS | 1754 | 8 | 10035 | 1 | 0 | 1 | 1,081 | 2,800 | 1905 | 1 | A4 | 1550000 | 8/22/19 | 40.75523 | -73.96185 |
| 11 WEST 12TH STREET, Manhattan | GREENWICH VILLAGE-CENTRAL | 01 ONE FAMILY DWELLINGS | 576 | 48 | 10011 | 1 | 0 | 1 | 2,581 | 8,125 | 1910 | 1 | A4 | 19900000 | 6/14/19 | 40.73509 | -73.99506 |
| 110 WASHINGTON PLACE, Manhattan | GREENWICH VILLAGE-WEST | 03 THREE FAMILY DWELLINGS | 592 | 14 | 10014 | 3 | 0 | 3 | 1,575 | 2,646 | 1899 | 1 | C0 | 6800000 | 4/15/19 | 14.27621 | 120.96607 |
An entry has really suspicious lat and lon columns, as they’re both wildly different than the others:
## manhattan[6, ]$ADDRESS manhattan[6, ]$lat manhattan[6, ]$lon
## 1 110 WASHINGTON PLACE, Manhattan 14.27621 120.9661
Using https://www.latlong.net/convert-address-to-lat-long.html, I found the right lat and lon and manually replaced the column values.
manhattan[6,]$lat <- 40.732422
manhattan[6,]$lon <- -74.001152
# Check the min and max of lat and lon that will also be used for mapping
height <- max(manhattan$lat) - min(manhattan$lat)
width <- max(manhattan$lon) - min(manhattan$lon)
borders <- c(bottom = min(manhattan$lat) - 0.2 * height,
top = max(manhattan$lat) + 0.2 * height,
left = min(manhattan$lon) - 0.2 * width,
right = max(manhattan$lon) + 0.2 * width)
borders
## bottom top left right
## 40.67401 40.91134 -74.03133 -73.88918
map <- get_stamenmap(borders, zoom = 11, maptype = "toner-lite")
ggmap(map) +
geom_point(data = manhattan, mapping = aes(x = lon, y = lat,
col = SALE.PRICE)) +
scale_color_gradient(low = "yellow", high = "red")
I’ll be doing more data exploration once I merge the current manhattan df with the 6 other datasets. I just wanted to output one simple map before doing so. It seems that the most expensive homes are located around Central Park (Upper East Side, Upper West Side), Greenwich Village, and SoHo.
schools <- read.csv("https://raw.githubusercontent.com/lcao21/GIS3_Final_Project/master/Data/schools.csv", stringsAsFactors = FALSE)
schools$LATITUDE <- as.numeric(schools$LATITUDE)
## Warning: NAs introduced by coercion
schools$LONGITUDE <- as.numeric(schools$LONGITUDE)
## Warning: NAs introduced by coercion
schools <-schools[!(schools$"LONGITUDE"== 0),]
schools <- schools[with(schools, order(LONGITUDE)),]
m_districts <- c(1,2,3,4,5,6,7,8,9,10)
schools <- schools[schools$Council.district %in% m_districts, ]
I found that filtering for council districts was the most accurate way of finding schools located in Manhattan. I initally tried using community district but the community district scale here is much different than the ones I found on the NYC.gov website. According to https://council.nyc.gov/districts/, Manhattan council districts are from 1 to 10 inclusive, though district 8 represents parts of Manhattan and the Bronx.
Next, I want to determine how far each school is from a home property, but for each property, I only care about schools within a certain proximity. Furthermore, since high schools aren’t subject to school zoning laws, distance betwen home and school is weighed more than for middle and elementary schools, which are assigned based on where students’ live. Although school zoning rules probably affect proprty sales (i.e. there’s a premium on property sales near the best public high school in Manhattan), I assume that more weight is put on attending good high schools rather than middle or elementary schools, since the former imapcts college acceptances and thus its quality is more important to parents and students.
hs_grades <- c("09,10,11,12")
schools <- schools[schools$Grades_text %like% hs_grades, ]
ggmap(map) +
geom_point(data = schools, mapping = aes(x = LONGITUDE, y = LATITUDE))
Now that I have the high schools, I want to now know how many of these schools fall within a buffer of a Manhattan home. So, I have to first create buffers around each home, and I’ve decided to create 1609.34 meter (1 mile) buffers. To do so, I had to convert the manhattan df to an sf object with st_as_sf and also transform the original EPSG:4326 CRS to the 2263 projected coordinate system (NAD83/New York Long Island). I also created a column called home_ID that gives each row (home) a unique number. Ultimately, I get an sf object called manhattan_sf_buff that has two fields: home_ID and geometry; geometry is the buffer polygon.
manhattan_copy <- copy(manhattan)
coordinates(manhattan_copy) <- c("lon", "lat")
proj4string(manhattan_copy) <- CRS("+init=epsg:4326")
manhattan_sf <- st_as_sf(manhattan_copy) %>% st_transform(2263) # nad83
manhattan_sf_buff <- st_buffer(manhattan_sf, 1609.34)
manhattan_sf_buff$home_ID <- seq.int(nrow(manhattan_sf_buff))
manhattan_sf_buff <- subset(manhattan_sf_buff, select=c("home_ID", "geometry"))
formattable(head(as.data.frame(manhattan_sf_buff)))
| home_ID | geometry |
|---|---|
| 1 | POLYGON ((996855.6 215213.3… |
| 2 | POLYGON ((996782.7 221474.6… |
| 3 | POLYGON ((986228.4 207337.2… |
| 4 | POLYGON ((996430 214430.1, … |
| 5 | POLYGON ((987229.7 207089.4… |
| 6 | POLYGON ((985540.1 206117.7… |
I go through a similar process of transforming the schools dataframe of coverting it into an sf object, re-projecting it to the EPSG:2263 projected coordinate system, and creating a column of unique, integer IDs. Ultimately, I get an sf object called schools_sf that has two fields: school_ID and geometry; geometry is of point geometry and represents each school.
schools_copy <- copy(schools)
coordinates(schools_copy) <- c("LONGITUDE", "LATITUDE")
proj4string(schools_copy) <- CRS("+init=epsg:4326")
schools_sf <- st_as_sf(schools_copy) %>% st_transform(2263)
schools_sf$school_ID <- seq.int(nrow(schools_sf))
schools_sf <- subset(schools_sf, select=c("school_ID", "geometry"))
formattable(head(as.data.frame(schools_sf)))
| school_ID | geometry | |
|---|---|---|
| 1079 | 1 | POINT (978997.2 190477.9) |
| 1080 | 2 | POINT (978997.2 190477.9) |
| 757 | 3 | POINT (979410.2 196631.7) |
| 1113 | 4 | POINT (979859.1 196347.8) |
| 1029 | 5 | POINT (980391 200872.7) |
| 911 | 6 | POINT (980561.6 196377.9) |
plot(manhattan_sf_buff$geometry, col="Blue")
plot(st_geometry(schools_sf), pch = 3, col = "Red", add= TRUE)
To calculate the intersections between the homes and high schools, I use the st_intersection() function from the sf library.
intersection <- st_intersection(x = manhattan_sf_buff, y = schools_sf)
## Warning: attribute variables are assumed to be spatially constant throughout all
## geometries
By plotting the homes and the intersections, we can see a clear decrease in the number of high schools, telling us that only certain schools are within 1609.34 meter buffers of a home.
plot(manhattan_sf_buff$geometry, col="Blue")
plot(intersection, col = "Red", pch = 3, add = TRUE)
## Warning in plot.sf(intersection, col = "Red", pch = 3, add = TRUE): ignoring all
## but the first attribute
int_result <- intersection %>%
group_by(home_ID) %>%
count()
formattable(head(int_result))
| home_ID | n | geometry |
|---|---|---|
| 3 | 2 | POINT (985433.1 207572.8) |
| 5 | 3 | MULTIPOINT ((985433.1 20757… |
| 8 | 1 | POINT (993448.1 229696.9) |
| 10 | 5 | MULTIPOINT ((982544.1 20488… |
| 11 | 4 | MULTIPOINT ((998395 234567…. |
| 12 | 4 | POINT (991316.2 225687.7) |
Now we have to left join the manhattan dataframe with int_result. First, I created the same column of unique, integer ID for manhattan so that I can join the two dataframes with the shared home_ID column. After the merge, homes that don’t have schools within its buffer have NA values in the “n” column so I replace those with zeros and rename the column num_schools. I call this final dataframe manhattan_int_schools.
manhattan$home_ID <- seq.int(nrow(manhattan))
manhattan_int_schools <- merge(x = manhattan, y = int_result, by = "home_ID", all.x = TRUE)
manhattan_int_schools$n[is.na(manhattan_int_schools$n)] <- 0
colnames(manhattan_int_schools)[20] <- "num_schools"
formattable(head(manhattan_int_schools))
| home_ID | ADDRESS | NEIGHBORHOOD | BUILDING.CLASS.CATEGORY | BLOCK | LOT | ZIP.CODE | RESIDENTIAL.UNITS | COMMERCIAL.UNITS | TOTAL.UNITS | LAND.SQUARE.FEET | GROSS.SQUARE.FEET | YEAR.BUILT | TAX.CLASS.AT.TIME.OF.SALE | BUILDING.CLASS.AT.TIME.OF.SALE | SALE.PRICE | SALE.DATE | lat | lon | num_schools | geometry |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 1 | 1 SUTTON PLACE, Manhattan | MIDTOWN EAST | 01 ONE FAMILY DWELLINGS | 1372 | 26 | 10022 | 1 | 0 | 1 | 1,434 | 5,339 | 1920 | 1 | A7 | 13000000 | 10/8/19 | 40.75738 | -73.96031 | 0 | GEOMETRYCOLLECTION EMPTY |
| 2 | 106 EAST 78TH STREET, Manhattan | UPPER EAST SIDE (59-79) | 01 ONE FAMILY DWELLINGS | 1412 | 68 | 10075 | 1 | 1 | 2 | 1,839 | 4,344 | 1899 | 1 | S1 | 7200000 | 6/10/19 | 40.77457 | -73.96056 | 0 | GEOMETRYCOLLECTION EMPTY |
| 3 | 109 WEST 11TH STREET, Manhattan | GREENWICH VILLAGE-WEST | 01 ONE FAMILY DWELLINGS | 607 | 50 | 10011 | 1 | 0 | 1 | 1,665 | 4,060 | 1899 | 1 | A4 | 11200000 | 5/22/19 | 40.73577 | -73.99867 | 2 | POINT (985433.1 207572.8) |
| 4 | 11 EAST 129 STREET, Manhattan | HARLEM-CENTRAL | 01 ONE FAMILY DWELLINGS | 1754 | 8 | 10035 | 1 | 0 | 1 | 1,081 | 2,800 | 1905 | 1 | A4 | 1550000 | 8/22/19 | 40.75523 | -73.96185 | 0 | GEOMETRYCOLLECTION EMPTY |
| 5 | 11 WEST 12TH STREET, Manhattan | GREENWICH VILLAGE-CENTRAL | 01 ONE FAMILY DWELLINGS | 576 | 48 | 10011 | 1 | 0 | 1 | 2,581 | 8,125 | 1910 | 1 | A4 | 19900000 | 6/14/19 | 40.73509 | -73.99506 | 3 | MULTIPOINT ((985433.1 20757… |
| 6 | 110 WASHINGTON PLACE, Manhattan | GREENWICH VILLAGE-WEST | 03 THREE FAMILY DWELLINGS | 592 | 14 | 10014 | 3 | 0 | 3 | 1,575 | 2,646 | 1899 | 1 | C0 | 6800000 | 4/15/19 | 40.73242 | -74.00115 | 0 | GEOMETRYCOLLECTION EMPTY |
# Schools that intersect with homes
schools_that_int <- as.data.frame(st_transform(intersection, crs = 4326))
schools_that_int <- schools_that_int[!duplicated(schools_that_int[,c("school_ID")]),]
schools$school_ID <- seq.int(nrow(schools))
test2 <- merge(schools, schools_that_int)
formattable(head(test2))
| school_ID | fiscal_year | system_code | location_code | location_name | BEDS | Managed_by_name | location_type_description | Location_Category_Description | Grades_text | Grades_final_text | open_date | Status_descriptions | Primary_building_code | primary_address_line_1 | State_code | X_COORDINATE | Y_COORDINATE | LONGITUDE | LATITUDE | Community_district | Council.district | Census_tract | Borough_block_lot | NTA | NTA_Name | Principal_Name | Principal_title | Principal_phone_number | fax_number | Geographical_District_code | Administrative_District_Code | Administrative_District_Name | community_school_sup_name | Tier_3_Support_Location_Name | Tier_3_Support_Leader_Name | Tier_2_Support_Location_Name | HighSchool_Network_Location_Code | HighSchool_Network_Name | HighSchool_Network_Superintendent | Community_district.1 | Police_precinct | home_ID | geometry |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 5 | 2020 | 02M475 | M475 | Stuyvesant High School | 310200011475 | DOE | General Academic | High school | 09,10,11,12 | 09,10,11,12 | 1996-06-05T00:00:00.000 | Open | M477 | 345 CHAMBERS STREET | NY | 980391 | 200873 | -74.01392 | 40.71802 | 101 | 1 | 31703 | 1000160215 | MN25 | Battery Park City-Lower Manhattan | Eric Contreras | Principal | 212-312-4800 | 212-587-3874 | 2 | 2 | NULL | NULL | NYCDOE Borough Office - Manhattan | CHU, YUET | School Support Team 1- Manhattan | HS04 | HS Network 04 Superintendent Office | ORLEN, VIVIAN | 101 | 1 | 130 | POINT (-74.01392 40.71802) |
| 13 | 2020 | 75M721 | M721 | P.S. M721 - Manhattan Occupational Training Center | 307500011721 | DOE | Special Education | Secondary School | 06,07,08,09,10,11,12,SE | 09,10,11,12,SE | 1990-12-04T00:00:00.000 | Open | M641 | 250 WEST HOUSTON STREET | NY | 982544 | 204890 | -74.00616 | 40.72905 | 102 | 3 | 6700 | 1005810054 | MN23 | West Village | SHOLOM FRIED | Principal | 212-675-7926 | 212-255-3227 | 2 | 75 | CITYWIDE SPECIAL EDUCATION | LOUISSAINT, KETLER | D75 CITYWIDE BCO | Tillman Roberto | Children First Network 755 | NULL | NULL | NULL | 102 | 6 | 10 | POINT (-74.00616 40.72905) |
| 14 | 2020 | 75M721 | M721 | P.S. M721 - Manhattan Occupational Training Center | 307500011721 | DOE | Special Education | Secondary School | 06,07,08,09,10,11,12,SE | 09,10,11,12,SE | 1990-12-04T00:00:00.000 | Open | M641 | 250 WEST HOUSTON STREET | NY | 982544 | 204890 | -74.00616 | 40.72905 | 102 | 3 | 6700 | 1005810054 | MN23 | West Village | SHOLOM FRIED | Principal | 212-675-7926 | 212-255-3227 | 2 | 75 | CITYWIDE SPECIAL EDUCATION | LOUISSAINT, KETLER | D75 CITYWIDE BCO | Tillman Roberto | Children First Network 755 | NULL | NULL | NULL | 102 | 6 | 10 | POINT (-74.00616 40.72905) |
| 15 | 2020 | 02M376 | M376 | NYC iSchool | 310200011376 | DOE | General Academic | High school | 09,10,11,12,SE | 09,10,11,12 | 2008-07-01T00:00:00.000 | Open | M615 | 131 AVENUE OF THE AMERICAS | NY | 982848 | 203279 | -74.00506 | 40.72463 | 102 | 3 | 3700 | 1004910016 | MN24 | SoHo-TriBeCa-Civic Center-Little Italy | Isora Bailey | Principal | 917-237-7300 | 212-219-0743 | 2 | 2 | NULL | NULL | NYCDOE Borough Office - Manhattan | CHU, YUET | School Support Team 1- Manhattan | HS04 | HS Network 04 Superintendent Office | ORLEN, VIVIAN | 102 | 1 | 10 | POINT (-74.00506 40.72463) |
| 16 | 2020 | 02M615 | M615 | Chelsea Career and Technical Education High School | 310200011615 | DOE | Career Technical | High school | 09,10,11,12,SE | 09,10,11,12 | 1905-07-01T00:00:00.000 | Open | M615 | 131 AVENUE OF THE AMERICAS | NY | 982848 | 203279 | -74.00506 | 40.72463 | 102 | 3 | 3700 | 1004910016 | MN24 | SoHo-TriBeCa-Civic Center-Little Italy | Jaivelle Reed | Principal | 212-925-1080 | 212-941-7934 | 2 | 2 | NULL | NULL | NYCDOE Borough Office - Manhattan | CHU, YUET | School Support Team 1- Manhattan | HS04 | HS Network 04 Superintendent Office | ORLEN, VIVIAN | 102 | 1 | 10 | POINT (-74.00506 40.72463) |
| 17 | 2020 | 84M522 | M522 | Broome Street Academy Charter School | 310200860992 | Charter | General Academic | High school | 09,10,11,12,SE | 09,10,11,12 | 2011-07-01T00:00:00.000 | Open | MBDF | 121 AVENUE OF THE AMERICAS | NY | 982950 | 203014 | -74.00469 | 40.72391 | 102 | 3 | 3700 | 0 | MN24 | SoHo-TriBeCa-Civic Center-Little Italy | Melissa Silberman | Principal | 212-453-0295 | 212-966-7253 | 2 | 84 | OFFICE OF CHARTER SCHOOLS | NULL | NULL | NULL | NULL | NULL | NULL | NULL | 102 | 1 | 10 | POINT (-74.00469 40.72391) |
# Homes with at least 1 school in its buffer
test3 <- manhattan_int_schools[manhattan_int_schools$num_schools > 0,]
formattable(head(test3))
| home_ID | ADDRESS | NEIGHBORHOOD | BUILDING.CLASS.CATEGORY | BLOCK | LOT | ZIP.CODE | RESIDENTIAL.UNITS | COMMERCIAL.UNITS | TOTAL.UNITS | LAND.SQUARE.FEET | GROSS.SQUARE.FEET | YEAR.BUILT | TAX.CLASS.AT.TIME.OF.SALE | BUILDING.CLASS.AT.TIME.OF.SALE | SALE.PRICE | SALE.DATE | lat | lon | num_schools | geometry | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 3 | 3 | 109 WEST 11TH STREET, Manhattan | GREENWICH VILLAGE-WEST | 01 ONE FAMILY DWELLINGS | 607 | 50 | 10011 | 1 | 0 | 1 | 1,665 | 4,060 | 1899 | 1 | A4 | 11200000 | 5/22/19 | 40.73577 | -73.99867 | 2 | POINT (985433.1 207572.8) |
| 5 | 5 | 11 WEST 12TH STREET, Manhattan | GREENWICH VILLAGE-CENTRAL | 01 ONE FAMILY DWELLINGS | 576 | 48 | 10011 | 1 | 0 | 1 | 2,581 | 8,125 | 1910 | 1 | A4 | 19900000 | 6/14/19 | 40.73509 | -73.99506 | 3 | MULTIPOINT ((985433.1 20757… |
| 8 | 8 | 115 MANHATTAN AVENUE, Manhattan | MANHATTAN VALLEY | 02 TWO FAMILY DWELLINGS | 1840 | 152 | 10025 | 2 | 0 | 2 | 900 | 3,600 | 1899 | 1 | B1 | 10 | 2/11/20 | 40.79783 | -73.96233 | 1 | POINT (993448.1 229696.9) |
| 10 | 10 | 116 SULLIVAN STREET, Manhattan | SOHO | 02 TWO FAMILY DWELLINGS | 504 | 29 | 10012 | 2 | 0 | 2 | 2,350 | 5,446 | 1900 | 1 | B1 | 13450000 | 3/2/20 | 40.72613 | -74.00286 | 5 | MULTIPOINT ((982544.1 20488… |
| 11 | 11 | 116 WEST 130TH STREET, Manhattan | HARLEM-CENTRAL | 03 THREE FAMILY DWELLINGS | 1914 | 41 | 10027 | 3 | 0 | 3 | 1,665 | 3,476 | 1910 | 1 | C0 | 2300000 | 4/19/19 | 40.81115 | -73.94412 | 4 | MULTIPOINT ((998395 234567…. |
| 12 | 12 | 116 WEST 80TH STREET, Manhattan | UPPER WEST SIDE (79-96) | 01 ONE FAMILY DWELLINGS | 1210 | 142 | 10024 | 1 | 0 | 1 | 1,839 | 5,520 | 1910 | 1 | A4 | 4675000 | 8/2/19 | 40.78288 | -73.97604 | 4 | POINT (991316.2 225687.7) |
map2 <- get_stamenmap(borders, zoom = 11, maptype = "terrain")
ggmap(map2) +
geom_point(data = test2, mapping = aes(x = LONGITUDE, y = LATITUDE), col="Red", shape=3) + # Schools
geom_point(data = test3, mapping = aes(x = lon, y = lat), col="Blue") # Homes
stores <- read.csv("https://raw.githubusercontent.com/lcao21/GIS3_Final_Project/master/Data/healthystores.csv", stringsAsFactors = FALSE)
stores <-stores[!is.na(stores$Longitude), ]
stores <- stores[stores$Borough %like% "New York", ]
After removing stores without latitude and longitude values and aren’t located in the New York borough, I’m left with 68 stores.
ggmap(map) +
geom_point(data = stores, mapping = aes(x = Longitude, y = Latitude))
It seems that the dataset for grocery stores in New York are all clustered in the Upper East Side. My dataset for Manhattan homes, however, is spread throughout Manhattan so I expect that a lot of homes will have zero stores in their buffers. So, I decided to combine stores with data on farmers markets.
The farmers market data, as shown below, is spread out more evently than stores.
farmers <- read.csv("https://raw.githubusercontent.com/lcao21/GIS3_Final_Project/master/Data/farmersmarkets.csv", stringsAsFactors = FALSE)
farmers <-farmers[!is.na(farmers$Longitude), ]
farmers <- farmers[farmers$Borough %like% "Manhattan", ]
ggmap(map) +
geom_point(data = farmers, mapping = aes(x = Longitude, y = Latitude))
The final merged dataframe is called food and has four columns: name, street_address, latitude, and longitude.
food_names <- c(stores$Store.Name, farmers$Market.Name)
stores$street_address <- paste(stores$Street.Address, stores$Address, sep=" ")
food_addresses <- c(stores$street_address, farmers$Street.Address)
food_lat <- c(stores$Latitude, farmers$Latitude)
food_lon <- c(stores$Longitude, farmers$Longitude)
food <- data.frame("name" = food_names, "street_address" = food_addresses, "latitude" = food_lat, "longitude" = food_lon)
ggmap(map) +
geom_point(data = food, mapping = aes(x = longitude, y = latitude))
Since I already created an sf object with 1609.34 meter buffers for each home–manhattan_sf_buff, I can use it again here. Now, I have to transform the food dataframe into an sf object so that I can use st_intersection() between the two.
foods_copy <- copy(food)
coordinates(foods_copy) <- c("longitude", "latitude")
proj4string(foods_copy) <- CRS("+init=epsg:4326")
food_sf <- st_as_sf(foods_copy) %>% st_transform(2263)
food_sf$food_ID <- seq.int(nrow(food_sf))
food_sf <- subset(food_sf, select=c("food_ID", "geometry"))
formattable(head(as.data.frame(food_sf)))
| food_ID | geometry |
|---|---|
| 1 | POINT (999869.1 230322.1) |
| 2 | POINT (1000307 230058.2) |
| 3 | POINT (1000599 228063.3) |
| 4 | POINT (1000611 228095) |
| 5 | POINT (1001628 229567.3) |
| 6 | POINT (1000484 227856.3) |
plot(manhattan_sf_buff$geometry, col="Blue")
plot(st_geometry(food_sf), pch = 3, col = "Green", add= TRUE)
food_intersection <- st_intersection(x = manhattan_sf_buff, y = food_sf)
## Warning: attribute variables are assumed to be spatially constant throughout all
## geometries
plot(manhattan_sf_buff$geometry, col="Blue")
plot(food_intersection, col = "Green", pch = 3, add = TRUE)
## Warning in plot.sf(food_intersection, col = "Green", pch = 3, add = TRUE):
## ignoring all but the first attribute
food_int_result <- food_intersection %>%
group_by(home_ID) %>%
count()
formattable(head(food_int_result))
| home_ID | n | geometry |
|---|---|---|
| 5 | 1 | POINT (986930.1 207834.2) |
| 8 | 1 | POINT (995422.1 231161.9) |
| 9 | 6 | MULTIPOINT ((997335.9 22679… |
| 11 | 2 | MULTIPOINT ((998555.1 23401… |
| 12 | 1 | POINT (990898 223916.9) |
| 19 | 10 | MULTIPOINT ((997335.9 22679… |
Now I left joined manhattan_int_schools with food_int_result, using home_ID as the mutual column, and renamed the dataframe manhattan_ints because all the dependent variable columns will be in here or eventually merged into it. For rows that have NA values for the column of food vendor counts, I replaced them with zeros.
manhattan_ints <- merge(x = manhattan_int_schools, y = food_int_result, by = "home_ID", all.x = TRUE)
manhattan_ints$n[is.na(manhattan_ints$n)] <- 0
colnames(manhattan_ints)[22] <- "num_food"
formattable(head(manhattan_ints))
| home_ID | ADDRESS | NEIGHBORHOOD | BUILDING.CLASS.CATEGORY | BLOCK | LOT | ZIP.CODE | RESIDENTIAL.UNITS | COMMERCIAL.UNITS | TOTAL.UNITS | LAND.SQUARE.FEET | GROSS.SQUARE.FEET | YEAR.BUILT | TAX.CLASS.AT.TIME.OF.SALE | BUILDING.CLASS.AT.TIME.OF.SALE | SALE.PRICE | SALE.DATE | lat | lon | num_schools | geometry.x | num_food | geometry.y |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 1 | 1 SUTTON PLACE, Manhattan | MIDTOWN EAST | 01 ONE FAMILY DWELLINGS | 1372 | 26 | 10022 | 1 | 0 | 1 | 1,434 | 5,339 | 1920 | 1 | A7 | 13000000 | 10/8/19 | 40.75738 | -73.96031 | 0 | GEOMETRYCOLLECTION EMPTY | 0 | GEOMETRYCOLLECTION EMPTY |
| 2 | 106 EAST 78TH STREET, Manhattan | UPPER EAST SIDE (59-79) | 01 ONE FAMILY DWELLINGS | 1412 | 68 | 10075 | 1 | 1 | 2 | 1,839 | 4,344 | 1899 | 1 | S1 | 7200000 | 6/10/19 | 40.77457 | -73.96056 | 0 | GEOMETRYCOLLECTION EMPTY | 0 | GEOMETRYCOLLECTION EMPTY |
| 3 | 109 WEST 11TH STREET, Manhattan | GREENWICH VILLAGE-WEST | 01 ONE FAMILY DWELLINGS | 607 | 50 | 10011 | 1 | 0 | 1 | 1,665 | 4,060 | 1899 | 1 | A4 | 11200000 | 5/22/19 | 40.73577 | -73.99867 | 2 | POINT (985433.1 207572.8) | 0 | GEOMETRYCOLLECTION EMPTY |
| 4 | 11 EAST 129 STREET, Manhattan | HARLEM-CENTRAL | 01 ONE FAMILY DWELLINGS | 1754 | 8 | 10035 | 1 | 0 | 1 | 1,081 | 2,800 | 1905 | 1 | A4 | 1550000 | 8/22/19 | 40.75523 | -73.96185 | 0 | GEOMETRYCOLLECTION EMPTY | 0 | GEOMETRYCOLLECTION EMPTY |
| 5 | 11 WEST 12TH STREET, Manhattan | GREENWICH VILLAGE-CENTRAL | 01 ONE FAMILY DWELLINGS | 576 | 48 | 10011 | 1 | 0 | 1 | 2,581 | 8,125 | 1910 | 1 | A4 | 19900000 | 6/14/19 | 40.73509 | -73.99506 | 3 | MULTIPOINT ((985433.1 20757… | 1 | POINT (986930.1 207834.2) |
| 6 | 110 WASHINGTON PLACE, Manhattan | GREENWICH VILLAGE-WEST | 03 THREE FAMILY DWELLINGS | 592 | 14 | 10014 | 3 | 0 | 3 | 1,575 | 2,646 | 1899 | 1 | C0 | 6800000 | 4/15/19 | 40.73242 | -74.00115 | 0 | GEOMETRYCOLLECTION EMPTY | 0 | GEOMETRYCOLLECTION EMPTY |
# Food vendors that intersect with homes
food_that_int <- as.data.frame(st_transform(food_intersection, crs = 4326))
food_that_int <- food_that_int[!duplicated(food_that_int[,c("food_ID")]),]
food$food_ID <- seq.int(nrow(food))
food_merged <- merge(food, food_that_int)
formattable(head(food_merged))
| food_ID | name | street_address | latitude | longitude | home_ID | geometry |
|---|---|---|---|---|---|---|
| 1 | Comidas Dominicanas Capellan Deli | 1626 Park Avenue | 40.79884 | -73.94359 | 156 | POINT (-73.94359 40.79884) |
| 5 | Juice Bar Deli | 308 East 116th Street | 40.79677 | -73.93724 | 93 | POINT (-73.93724 40.79677) |
| 9 | El Tepeyac Grocery | 1621 Lexington Avenue | 40.78992 | -73.94796 | 19 | POINT (-73.94796 40.78992) |
| 13 | JJ Food Market | 153 East 99th Street | 40.78769 | -73.94923 | 9 | POINT (-73.94923 40.78769) |
| 14 | Deli Mini Mart | 339 East 115th Street | 40.79565 | -73.93670 | 93 | POINT (-73.9367 40.79565) |
| 15 | Cherry Deli Gourmet | 1575 Lexington Avenue | 40.78874 | -73.94882 | 9 | POINT (-73.94882 40.78874) |
# Homes with at least 1 food vendor in its buffer
homes_with_food <- manhattan_ints[manhattan_ints$num_food > 0,]
formattable(head(homes_with_food))
| home_ID | ADDRESS | NEIGHBORHOOD | BUILDING.CLASS.CATEGORY | BLOCK | LOT | ZIP.CODE | RESIDENTIAL.UNITS | COMMERCIAL.UNITS | TOTAL.UNITS | LAND.SQUARE.FEET | GROSS.SQUARE.FEET | YEAR.BUILT | TAX.CLASS.AT.TIME.OF.SALE | BUILDING.CLASS.AT.TIME.OF.SALE | SALE.PRICE | SALE.DATE | lat | lon | num_schools | geometry.x | num_food | geometry.y | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 5 | 5 | 11 WEST 12TH STREET, Manhattan | GREENWICH VILLAGE-CENTRAL | 01 ONE FAMILY DWELLINGS | 576 | 48 | 10011 | 1 | 0 | 1 | 2,581 | 8,125 | 1910 | 1 | A4 | 19900000 | 6/14/19 | 40.73509 | -73.99506 | 3 | MULTIPOINT ((985433.1 20757… | 1 | POINT (986930.1 207834.2) |
| 8 | 8 | 115 MANHATTAN AVENUE, Manhattan | MANHATTAN VALLEY | 02 TWO FAMILY DWELLINGS | 1840 | 152 | 10025 | 2 | 0 | 2 | 900 | 3,600 | 1899 | 1 | B1 | 10 | 2/11/20 | 40.79783 | -73.96233 | 1 | POINT (993448.1 229696.9) | 1 | POINT (995422.1 231161.9) |
| 9 | 9 | 116 EAST 95TH STREET, Manhattan | UPPER EAST SIDE (79-96) | 01 ONE FAMILY DWELLINGS | 1523 | 68 | 10128 | 1 | 0 | 1 | 2,013 | 4,160 | 1899 | 1 | A9 | 8000000 | 3/20/20 | 40.78548 | -73.95259 | 0 | GEOMETRYCOLLECTION EMPTY | 6 | MULTIPOINT ((997335.9 22679… |
| 11 | 11 | 116 WEST 130TH STREET, Manhattan | HARLEM-CENTRAL | 03 THREE FAMILY DWELLINGS | 1914 | 41 | 10027 | 3 | 0 | 3 | 1,665 | 3,476 | 1910 | 1 | C0 | 2300000 | 4/19/19 | 40.81115 | -73.94412 | 4 | MULTIPOINT ((998395 234567…. | 2 | MULTIPOINT ((998555.1 23401… |
| 12 | 12 | 116 WEST 80TH STREET, Manhattan | UPPER WEST SIDE (79-96) | 01 ONE FAMILY DWELLINGS | 1210 | 142 | 10024 | 1 | 0 | 1 | 1,839 | 5,520 | 1910 | 1 | A4 | 4675000 | 8/2/19 | 40.78288 | -73.97604 | 4 | POINT (991316.2 225687.7) | 1 | POINT (990898 223916.9) |
| 19 | 19 | 1255 PARK AVENUE, Manhattan | HARLEM-EAST | 02 TWO FAMILY DWELLINGS | 1625 | 1 | 10029 | 2 | 0 | 2 | 847 | 1,650 | 1920 | 1 | B1 | 906500 | 2/7/20 | 40.78720 | -73.95172 | 0 | GEOMETRYCOLLECTION EMPTY | 10 | MULTIPOINT ((997335.9 22679… |
ggmap(map2) +
geom_point(data = food_merged, mapping = aes(x = longitude, y = latitude), col="Green", shape=3) + # Food
geom_point(data = homes_with_food, mapping = aes(x = lon, y = lat), col="Blue") # Homes
I repeat the whole process again but for the parks dataset this time. Unlike schools, stores, and farmers’ markets, however, parks aren’t represent as points but rather as polygons, and I loaded them in as geojson file.
parks <-geojson_sf("parks.geojson")
parks$park_ID <- seq.int(nrow(parks))
formattable(head(as.data.frame(parks)))
| system | shape_leng | shape_area | parknum | landuse | park_name | sub_code | feat_code | status | source_id | geometry | park_ID |
|---|---|---|---|---|---|---|---|---|---|---|---|
| NA | 829.420106841 | 41539.8013629 | B021 | Community Park | Commodore Barry Park | 491050 | 4910 | Updated | 21491000001 | MULTIPOLYGON (((-73.97922 4… | 1 |
| NA | 607.555193338 | 21960.4949197 | B222 | Neighborhood Park | Pierrepont Playground | 498000 | 4980 | Unchanged | 21498000002 | MULTIPOLYGON (((-73.99732 4… | 2 |
| NA | 784.42199003 | 28079.0762647 | B326 | Neighborhood Park | Cobble Hill Park | 498000 | 4980 | Unchanged | 21498000003 | MULTIPOLYGON (((-73.99557 4… | 3 |
| NA | 2672.71014961 | 438416.114511 | B021 | Community Park | Commodore Barry Park | 498000 | 4980 | Unchanged | 21498000004 | MULTIPOLYGON (((-73.97747 4… | 4 |
| NA | 1376.10639843 | 14465.699799 | B223DG | Triangle/Plaza | Brooklyn Heights Promenade | 498000 | 4980 | Unchanged | 21498000005 | MULTIPOLYGON (((-73.99747 4… | 5 |
| NA | 1218.55982108 | 54510.4377633 | B223K | Parkway | Trinity Park | 498000 | 4980 | Unchanged | 21498000015 | MULTIPOLYGON (((-73.98416 4… | 6 |
There isn’t a column for borough, so I can’t filter for only parks in Manhattan. Nevertheless, this shouldn’t be a problem once I calculate the intersections. First, I have to re-project the parks data.
parks_copy <- copy(parks)
parks_sf <- st_transform(parks_copy, crs = 2263)
plot(manhattan_sf_buff$geometry, col="Blue")
plot(st_geometry(parks_sf), col = "darkseagreen1", add= TRUE)
parks_intersection <- st_intersection(x = manhattan_sf_buff, y = parks_sf)
## Warning: attribute variables are assumed to be spatially constant throughout all
## geometries
plot(manhattan_sf_buff$geometry, col="Blue")
plot(parks_intersection, col = "darkseagreen1", add = TRUE)
## Warning in plot.sf(parks_intersection, col = "darkseagreen1", add = TRUE):
## ignoring all but the first attribute
parks_int_result <- parks_intersection %>%
group_by(home_ID) %>%
count()
formattable(head(parks_int_result))
| home_ID | n | geometry |
|---|---|---|
| 1 | 18 | MULTIPOLYGON (((994393.8 21… |
| 2 | 14 | MULTIPOLYGON (((994251.7 22… |
| 3 | 20 | MULTIPOLYGON (((985002 2057… |
| 4 | 13 | MULTIPOLYGON (((993976 2134… |
| 5 | 5 | MULTIPOLYGON (((985441.6 20… |
| 6 | 41 | MULTIPOLYGON (((983626.9 20… |
The counts seem really high but if we look at parks_intersection, we see that some of the parks are being counted multiple times for the same parks. This is because the original dataset assigns different shape_area, system, source_id, parknum, and shape_leng columns to some of the same parks. So even after applying the unique() function, I get the same rows.
nrow(as.data.frame(unique(parks_intersection))) == nrow(as.data.frame(parks_intersection))
## [1] TRUE
I add the counts of parks that intersect with home buffers to the main dataframe: manhattan_ints.
manhattan_ints <- merge(x = manhattan_ints, y = parks_int_result, by = "home_ID", all.x = TRUE)
manhattan_ints$n[is.na(manhattan_ints$n)] <- 0
colnames(manhattan_ints)[24] <- "num_parks"
formattable(head(manhattan_ints))
| home_ID | ADDRESS | NEIGHBORHOOD | BUILDING.CLASS.CATEGORY | BLOCK | LOT | ZIP.CODE | RESIDENTIAL.UNITS | COMMERCIAL.UNITS | TOTAL.UNITS | LAND.SQUARE.FEET | GROSS.SQUARE.FEET | YEAR.BUILT | TAX.CLASS.AT.TIME.OF.SALE | BUILDING.CLASS.AT.TIME.OF.SALE | SALE.PRICE | SALE.DATE | lat | lon | num_schools | geometry.x | num_food | geometry.y | num_parks | geometry |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 1 | 1 SUTTON PLACE, Manhattan | MIDTOWN EAST | 01 ONE FAMILY DWELLINGS | 1372 | 26 | 10022 | 1 | 0 | 1 | 1,434 | 5,339 | 1920 | 1 | A7 | 13000000 | 10/8/19 | 40.75738 | -73.96031 | 0 | GEOMETRYCOLLECTION EMPTY | 0 | GEOMETRYCOLLECTION EMPTY | 18 | MULTIPOLYGON (((994393.8 21… |
| 2 | 106 EAST 78TH STREET, Manhattan | UPPER EAST SIDE (59-79) | 01 ONE FAMILY DWELLINGS | 1412 | 68 | 10075 | 1 | 1 | 2 | 1,839 | 4,344 | 1899 | 1 | S1 | 7200000 | 6/10/19 | 40.77457 | -73.96056 | 0 | GEOMETRYCOLLECTION EMPTY | 0 | GEOMETRYCOLLECTION EMPTY | 14 | MULTIPOLYGON (((994251.7 22… |
| 3 | 109 WEST 11TH STREET, Manhattan | GREENWICH VILLAGE-WEST | 01 ONE FAMILY DWELLINGS | 607 | 50 | 10011 | 1 | 0 | 1 | 1,665 | 4,060 | 1899 | 1 | A4 | 11200000 | 5/22/19 | 40.73577 | -73.99867 | 2 | POINT (985433.1 207572.8) | 0 | GEOMETRYCOLLECTION EMPTY | 20 | MULTIPOLYGON (((985002 2057… |
| 4 | 11 EAST 129 STREET, Manhattan | HARLEM-CENTRAL | 01 ONE FAMILY DWELLINGS | 1754 | 8 | 10035 | 1 | 0 | 1 | 1,081 | 2,800 | 1905 | 1 | A4 | 1550000 | 8/22/19 | 40.75523 | -73.96185 | 0 | GEOMETRYCOLLECTION EMPTY | 0 | GEOMETRYCOLLECTION EMPTY | 13 | MULTIPOLYGON (((993976 2134… |
| 5 | 11 WEST 12TH STREET, Manhattan | GREENWICH VILLAGE-CENTRAL | 01 ONE FAMILY DWELLINGS | 576 | 48 | 10011 | 1 | 0 | 1 | 2,581 | 8,125 | 1910 | 1 | A4 | 19900000 | 6/14/19 | 40.73509 | -73.99506 | 3 | MULTIPOINT ((985433.1 20757… | 1 | POINT (986930.1 207834.2) | 5 | MULTIPOLYGON (((985441.6 20… |
| 6 | 110 WASHINGTON PLACE, Manhattan | GREENWICH VILLAGE-WEST | 03 THREE FAMILY DWELLINGS | 592 | 14 | 10014 | 3 | 0 | 3 | 1,575 | 2,646 | 1899 | 1 | C0 | 6800000 | 4/15/19 | 40.73242 | -74.00115 | 0 | GEOMETRYCOLLECTION EMPTY | 0 | GEOMETRYCOLLECTION EMPTY | 41 | MULTIPOLYGON (((983626.9 20… |
Like parks, I read in the data for subway stations as a geojson file.
subway <- geojson_sf("subway.geojson")
subway$subway_ID <- seq.int(nrow(subway))
formattable(head(as.data.frame(subway)))
| notes | url | name | line | objectid | geometry | subway_ID |
|---|---|---|---|---|---|---|
| 4 nights, 6-all times, 6 Express-weekdays AM southbound, PM northbound | http://web.mta.info/nyct/service/ | Astor Pl | 4-6-6 Express | 1 | POINT (-73.99107 40.73005) | 1 |
| 4 nights, 6-all times, 6 Express-weekdays AM southbound, PM northbound | http://web.mta.info/nyct/service/ | Canal St | 4-6-6 Express | 2 | POINT (-74.00019 40.7188) | 2 |
| 1-all times, 2-nights | http://web.mta.info/nyct/service/ | 50th St | 1-2 | 3 | POINT (-73.98385 40.76173) | 3 |
| 4-nights, 3-all other times, 2-all times | http://web.mta.info/nyct/service/ | Bergen St | 2-3-4 | 4 | POINT (-73.975 40.68086) | 4 |
| 4-nights, 3-all other times | http://web.mta.info/nyct/service/ | Pennsylvania Ave | 3-4 | 5 | POINT (-73.89489 40.66471) | 5 |
| 1-all times, exit only northbound | http://web.mta.info/nyct/service/ | 238th St | 1 | 6 | POINT (-73.90087 40.88467) | 6 |
subway_copy <- copy(subway)
subway_sf <- st_transform(subway_copy, crs = 2263)
There isn’t a borough column, so I can’t filter for subway stations only located in Manhattan, but this isn’t an issue.
plot(manhattan_sf_buff$geometry, col="Blue")
plot(st_geometry(subway_sf), col = "orange", pch = 3, add= TRUE)
subway_intersection <- st_intersection(x = manhattan_sf_buff, y = subway_sf)
## Warning: attribute variables are assumed to be spatially constant throughout all
## geometries
plot(manhattan_sf_buff$geometry, col="Blue")
plot(subway_intersection, col = "orange", pch = 3, add = TRUE)
## Warning in plot.sf(subway_intersection, col = "orange", pch = 3, add = TRUE):
## ignoring all but the first attribute
subway_int_result <- subway_intersection %>%
group_by(home_ID) %>%
count()
formattable(head(subway_int_result))
| home_ID | n | geometry |
|---|---|---|
| 2 | 1 | POINT (995363.9 221130) |
| 3 | 6 | MULTIPOINT ((983444.6 20648… |
| 5 | 5 | MULTIPOINT ((984873.3 20805… |
| 6 | 3 | MULTIPOINT ((983444.6 20648… |
| 7 | 1 | POINT (995363.9 221130) |
| 8 | 2 | MULTIPOINT ((994945.9 22930… |
I add the column of counts of subway stations that intersect with home buffers to the main dataframe: manhattan_ints.
manhattan_ints <- merge(x = manhattan_ints, y = subway_int_result, by = "home_ID", all.x = TRUE)
## Warning in merge.data.frame(x = manhattan_ints, y = subway_int_result, by =
## "home_ID", : column names 'geometry.x', 'geometry.y' are duplicated in the
## result
manhattan_ints$n[is.na(manhattan_ints$n)] <- 0
colnames(manhattan_ints)[26] <- "num_subway"
formattable(head(manhattan_ints))
| home_ID | ADDRESS | NEIGHBORHOOD | BUILDING.CLASS.CATEGORY | BLOCK | LOT | ZIP.CODE | RESIDENTIAL.UNITS | COMMERCIAL.UNITS | TOTAL.UNITS | LAND.SQUARE.FEET | GROSS.SQUARE.FEET | YEAR.BUILT | TAX.CLASS.AT.TIME.OF.SALE | BUILDING.CLASS.AT.TIME.OF.SALE | SALE.PRICE | SALE.DATE | lat | lon | num_schools | geometry.x | num_food | geometry.y | num_parks | num_subway |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 1 | 1 SUTTON PLACE, Manhattan | MIDTOWN EAST | 01 ONE FAMILY DWELLINGS | 1372 | 26 | 10022 | 1 | 0 | 1 | 1,434 | 5,339 | 1920 | 1 | A7 | 13000000 | 10/8/19 | 40.75738 | -73.96031 | 0 | GEOMETRYCOLLECTION EMPTY | 0 | GEOMETRYCOLLECTION EMPTY | 18 | 0 |
| 2 | 106 EAST 78TH STREET, Manhattan | UPPER EAST SIDE (59-79) | 01 ONE FAMILY DWELLINGS | 1412 | 68 | 10075 | 1 | 1 | 2 | 1,839 | 4,344 | 1899 | 1 | S1 | 7200000 | 6/10/19 | 40.77457 | -73.96056 | 0 | GEOMETRYCOLLECTION EMPTY | 0 | GEOMETRYCOLLECTION EMPTY | 14 | 1 |
| 3 | 109 WEST 11TH STREET, Manhattan | GREENWICH VILLAGE-WEST | 01 ONE FAMILY DWELLINGS | 607 | 50 | 10011 | 1 | 0 | 1 | 1,665 | 4,060 | 1899 | 1 | A4 | 11200000 | 5/22/19 | 40.73577 | -73.99867 | 2 | POINT (985433.1 207572.8) | 0 | GEOMETRYCOLLECTION EMPTY | 20 | 6 |
| 4 | 11 EAST 129 STREET, Manhattan | HARLEM-CENTRAL | 01 ONE FAMILY DWELLINGS | 1754 | 8 | 10035 | 1 | 0 | 1 | 1,081 | 2,800 | 1905 | 1 | A4 | 1550000 | 8/22/19 | 40.75523 | -73.96185 | 0 | GEOMETRYCOLLECTION EMPTY | 0 | GEOMETRYCOLLECTION EMPTY | 13 | 0 |
| 5 | 11 WEST 12TH STREET, Manhattan | GREENWICH VILLAGE-CENTRAL | 01 ONE FAMILY DWELLINGS | 576 | 48 | 10011 | 1 | 0 | 1 | 2,581 | 8,125 | 1910 | 1 | A4 | 19900000 | 6/14/19 | 40.73509 | -73.99506 | 3 | MULTIPOINT ((985433.1 20757… | 1 | POINT (986930.1 207834.2) | 5 | 5 |
| 6 | 110 WASHINGTON PLACE, Manhattan | GREENWICH VILLAGE-WEST | 03 THREE FAMILY DWELLINGS | 592 | 14 | 10014 | 3 | 0 | 3 | 1,575 | 2,646 | 1899 | 1 | C0 | 6800000 | 4/15/19 | 40.73242 | -74.00115 | 0 | GEOMETRYCOLLECTION EMPTY | 0 | GEOMETRYCOLLECTION EMPTY | 41 | 3 |
Before I go onto feature selection, I want to look at my “final” dataframe: manhattan_ints.
formattable(head(manhattan_ints))
| home_ID | ADDRESS | NEIGHBORHOOD | BUILDING.CLASS.CATEGORY | BLOCK | LOT | ZIP.CODE | RESIDENTIAL.UNITS | COMMERCIAL.UNITS | TOTAL.UNITS | LAND.SQUARE.FEET | GROSS.SQUARE.FEET | YEAR.BUILT | TAX.CLASS.AT.TIME.OF.SALE | BUILDING.CLASS.AT.TIME.OF.SALE | SALE.PRICE | SALE.DATE | lat | lon | num_schools | geometry.x | num_food | geometry.y | num_parks | num_subway |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 1 | 1 SUTTON PLACE, Manhattan | MIDTOWN EAST | 01 ONE FAMILY DWELLINGS | 1372 | 26 | 10022 | 1 | 0 | 1 | 1,434 | 5,339 | 1920 | 1 | A7 | 13000000 | 10/8/19 | 40.75738 | -73.96031 | 0 | GEOMETRYCOLLECTION EMPTY | 0 | GEOMETRYCOLLECTION EMPTY | 18 | 0 |
| 2 | 106 EAST 78TH STREET, Manhattan | UPPER EAST SIDE (59-79) | 01 ONE FAMILY DWELLINGS | 1412 | 68 | 10075 | 1 | 1 | 2 | 1,839 | 4,344 | 1899 | 1 | S1 | 7200000 | 6/10/19 | 40.77457 | -73.96056 | 0 | GEOMETRYCOLLECTION EMPTY | 0 | GEOMETRYCOLLECTION EMPTY | 14 | 1 |
| 3 | 109 WEST 11TH STREET, Manhattan | GREENWICH VILLAGE-WEST | 01 ONE FAMILY DWELLINGS | 607 | 50 | 10011 | 1 | 0 | 1 | 1,665 | 4,060 | 1899 | 1 | A4 | 11200000 | 5/22/19 | 40.73577 | -73.99867 | 2 | POINT (985433.1 207572.8) | 0 | GEOMETRYCOLLECTION EMPTY | 20 | 6 |
| 4 | 11 EAST 129 STREET, Manhattan | HARLEM-CENTRAL | 01 ONE FAMILY DWELLINGS | 1754 | 8 | 10035 | 1 | 0 | 1 | 1,081 | 2,800 | 1905 | 1 | A4 | 1550000 | 8/22/19 | 40.75523 | -73.96185 | 0 | GEOMETRYCOLLECTION EMPTY | 0 | GEOMETRYCOLLECTION EMPTY | 13 | 0 |
| 5 | 11 WEST 12TH STREET, Manhattan | GREENWICH VILLAGE-CENTRAL | 01 ONE FAMILY DWELLINGS | 576 | 48 | 10011 | 1 | 0 | 1 | 2,581 | 8,125 | 1910 | 1 | A4 | 19900000 | 6/14/19 | 40.73509 | -73.99506 | 3 | MULTIPOINT ((985433.1 20757… | 1 | POINT (986930.1 207834.2) | 5 | 5 |
| 6 | 110 WASHINGTON PLACE, Manhattan | GREENWICH VILLAGE-WEST | 03 THREE FAMILY DWELLINGS | 592 | 14 | 10014 | 3 | 0 | 3 | 1,575 | 2,646 | 1899 | 1 | C0 | 6800000 | 4/15/19 | 40.73242 | -74.00115 | 0 | GEOMETRYCOLLECTION EMPTY | 0 | GEOMETRYCOLLECTION EMPTY | 41 | 3 |
I don’t need the geometry columns so I drop them and am left with the columns I care about. Specifically, all my variables come from this dataframe:
Dependent Variable: SALE.PRICE Independent Variables: num_schools, num_food, num_parks, num_subway, RESIDENTIAL.UNITS, COMMERCIAL.UNITS, LAND.SQUARE.FEET, GROSS.SQUARE.FEET, NEIGHBORHOOD
manhattan_ints <- manhattan_ints[ , !(names(manhattan_ints) %like% "geometry")]
I have to transform the NEIGHBORHOOD column into an ordinal, categorical variable and will do so by ranking the neighborhoods by income. This is because the high median income in a neighborhood typically indicates high property prices and thus rich neighborhoods have a premium on home prices.
unique(manhattan_ints$NEIGHBORHOOD)
## [1] "MIDTOWN EAST" "UPPER EAST SIDE (59-79)"
## [3] "GREENWICH VILLAGE-WEST" "HARLEM-CENTRAL"
## [5] "GREENWICH VILLAGE-CENTRAL" "MANHATTAN VALLEY"
## [7] "UPPER EAST SIDE (79-96)" "SOHO"
## [9] "UPPER WEST SIDE (79-96)" "INWOOD"
## [11] "EAST VILLAGE" "HARLEM-EAST"
## [13] "GRAMERCY" "UPPER WEST SIDE (59-79)"
## [15] "WASHINGTON HEIGHTS LOWER" "MURRAY HILL"
## [17] "CHELSEA" "TRIBECA"
## [19] "SOUTHBRIDGE" "ALPHABET CITY"
## [21] "WASHINGTON HEIGHTS UPPER" "KIPS BAY"
## [23] "HARLEM-UPPER"
According to https://ny.curbed.com/2017/8/4/16099252/new-york-neighborhood-affordability, the neighborhoods with the highest to lowest median household income are: 1) Upper East Side (59-79); Upper East Side (79-96) 2) Soho; Tribeca 3) Midtown East 4) Greenwich Village Central; Greenwich Village West 5) Chelsea 6) Gramercy 7) Murray Hill; Kips Bay 8) Upper West Side (59-79); Upper West Side (79-96); Manhattan Valley 9) East Village; Alphabet City 10) Washington Heights Upper 11) Harlem Central; Harlem Upper 12) Inwood 13) Washington Heights Lower 14) Southbridge 15) Harlem East
As such, I’ll assign each level a number, with 15 being the highest income and 1 being the lowest income (the reverse of the above list).
manhattan_ints$neighborhood_ord <-
ifelse(manhattan_ints$NEIGHBORHOOD %like% "UPPER EAST SIDE", 15,
ifelse(manhattan_ints$NEIGHBORHOOD %in% c("SOHO", "TRIBECA"), 14,
ifelse(manhattan_ints$NEIGHBORHOOD %like% "MIDTOWN EAST", 13,
ifelse(manhattan_ints$NEIGHBORHOOD %like% "GREENWICH VILLAGE-", 12,
ifelse(manhattan_ints$NEIGHBORHOOD %like% "CHELSEA", 11,
ifelse(manhattan_ints$NEIGHBORHOOD %like% "GRAMERCY", 10,
ifelse(manhattan_ints$NEIGHBORHOOD %in% c("MURRAY HILL", "KIPS BAY"), 9,
ifelse(manhattan_ints$NEIGHBORHOOD %in% c("UPPER WEST SIDE (59-79)", "UPPER WEST SIDE (79-96)", "MANHATTAN VALLEY"), 8,
ifelse(manhattan_ints$NEIGHBORHOOD %in% c("EAST VILLAGE", "ALPHABET CITY"), 7,
ifelse(manhattan_ints$NEIGHBORHOOD %like% "WASHINGTON HEIGHTS UPPER", 6,
ifelse(manhattan_ints$NEIGHBORHOOD %in% c("HARLEM-CENTRAL", "HARLEM-UPPER"), 5,
ifelse(manhattan_ints$NEIGHBORHOOD %like% "INWOOD", 4,
ifelse(manhattan_ints$NEIGHBORHOOD %like% "WASHINGTON HEIGHTS LOWER", 3,
ifelse(manhattan_ints$NEIGHBORHOOD %like% "SOUTHBRIDGE", 2,
ifelse(manhattan_ints$NEIGHBORHOOD %like% "HARLEM-EAST", 1, 0)))))))))))))))
formattable(head(manhattan_ints))
| home_ID | ADDRESS | NEIGHBORHOOD | BUILDING.CLASS.CATEGORY | BLOCK | LOT | ZIP.CODE | RESIDENTIAL.UNITS | COMMERCIAL.UNITS | TOTAL.UNITS | LAND.SQUARE.FEET | GROSS.SQUARE.FEET | YEAR.BUILT | TAX.CLASS.AT.TIME.OF.SALE | BUILDING.CLASS.AT.TIME.OF.SALE | SALE.PRICE | SALE.DATE | lat | lon | num_schools | num_food | num_parks | num_subway | neighborhood_ord |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 1 | 1 SUTTON PLACE, Manhattan | MIDTOWN EAST | 01 ONE FAMILY DWELLINGS | 1372 | 26 | 10022 | 1 | 0 | 1 | 1,434 | 5,339 | 1920 | 1 | A7 | 13000000 | 10/8/19 | 40.75738 | -73.96031 | 0 | 0 | 18 | 0 | 13 |
| 2 | 106 EAST 78TH STREET, Manhattan | UPPER EAST SIDE (59-79) | 01 ONE FAMILY DWELLINGS | 1412 | 68 | 10075 | 1 | 1 | 2 | 1,839 | 4,344 | 1899 | 1 | S1 | 7200000 | 6/10/19 | 40.77457 | -73.96056 | 0 | 0 | 14 | 1 | 15 |
| 3 | 109 WEST 11TH STREET, Manhattan | GREENWICH VILLAGE-WEST | 01 ONE FAMILY DWELLINGS | 607 | 50 | 10011 | 1 | 0 | 1 | 1,665 | 4,060 | 1899 | 1 | A4 | 11200000 | 5/22/19 | 40.73577 | -73.99867 | 2 | 0 | 20 | 6 | 12 |
| 4 | 11 EAST 129 STREET, Manhattan | HARLEM-CENTRAL | 01 ONE FAMILY DWELLINGS | 1754 | 8 | 10035 | 1 | 0 | 1 | 1,081 | 2,800 | 1905 | 1 | A4 | 1550000 | 8/22/19 | 40.75523 | -73.96185 | 0 | 0 | 13 | 0 | 5 |
| 5 | 11 WEST 12TH STREET, Manhattan | GREENWICH VILLAGE-CENTRAL | 01 ONE FAMILY DWELLINGS | 576 | 48 | 10011 | 1 | 0 | 1 | 2,581 | 8,125 | 1910 | 1 | A4 | 19900000 | 6/14/19 | 40.73509 | -73.99506 | 3 | 1 | 5 | 5 | 12 |
| 6 | 110 WASHINGTON PLACE, Manhattan | GREENWICH VILLAGE-WEST | 03 THREE FAMILY DWELLINGS | 592 | 14 | 10014 | 3 | 0 | 3 | 1,575 | 2,646 | 1899 | 1 | C0 | 6800000 | 4/15/19 | 40.73242 | -74.00115 | 0 | 0 | 41 | 3 | 12 |
So now,
Dependent Variable: SALE.PRICE
Independent Variables: num_schools, num_food, num_parks, num_subway, RESIDENTIAL.UNITS, COMMERCIAL.UNITS, LAND.SQUARE.FEET, GROSS.SQUARE.FEET, neighborhood_ord
Next, I have to perform feature selection and will show my steps in the next document.